# Read AB_NYC_2019.csv; both empty strings and the literal "NA" are treated
# as missing values.
data <- read.csv("AB_NYC_2019.csv", na.strings = c("", "NA"))

# Working copy; `data` itself is kept as the untouched import.
dat <- data

# Visualise which columns are missing together (gg_miss_upset).
gg_miss_upset(dat)

# Number of rows read.
nrow(dat)
## [1] 48895

availability

# Derive two helper columns from the raw import:
#   avail_0 - TRUE when availability_365 equals 0 (NA stays NA).
#   term    - minimum_nights bucketed as short (<= 7), middle (<= 45) or
#             long (> 45).
dat <- data %>%
  mutate(
    # The comparison already yields a logical vector, so wrapping it in
    # if_else(..., TRUE, FALSE) adds nothing.
    avail_0 = availability_365 == 0,
    # cut() uses right-closed intervals by default, i.e. (-Inf, 7], (7, 45],
    # (45, Inf], which matches the original <= 7 / <= 45 thresholds. The result
    # is a factor whose levels run short < middle < long, so axes and legends
    # follow that order instead of sorting alphabetically.
    term = cut(
      minimum_nights,
      breaks = c(-Inf, 7, 45, Inf),
      labels = c("short", "middle", "long")
    )
  )
#dat%>%filter(minimum_nights>3,availability_365==0,number_of_reviews==0)

# Log-price density split by the zero-availability flag. Non-finite values of
# log(price) are dropped by ggplot2 (presumably the price == 0 listings, since
# log(0) is -Inf), which is what the warning below reports.
ggplot(dat, aes(x = log(price), fill = avail_0)) +
  geom_density(alpha = 0.5)
## Warning: Removed 11 rows containing non-finite values (stat_density).

# Same split for the review count; log(1 + x) keeps zero-review listings finite.
ggplot(dat, aes(x = log(1 + number_of_reviews), fill = avail_0)) +
  geom_density(alpha = 0.5)

minimum nights

#ids=dat%>%filter(price==0)%>%pull(host_id)
#dat%>%filter(host_id%in%ids)

# 99th percentile of minimum_nights.
quantile(dat$minimum_nights, probs = 0.99)
## 99% 
##  45

# Share of listings whose minimum_nights is at most 30.
mean(dat$minimum_nights <= 30)
## [1] 0.9847224

# Log-price density, one curve per `term` bucket.
ggplot(dat, aes(x = log(price), fill = term)) +
  geom_density(alpha = 0.5)
## Warning: Removed 11 rows containing non-finite values (stat_density).

# Review counts (log(1 + x) scale) by `term` bucket.
ggplot(dat, aes(x = term, y = log(1 + number_of_reviews))) +
  geom_boxplot(alpha = 0.5)

# Log price by `term` bucket.
ggplot(dat, aes(x = term, y = log(price))) +
  geom_boxplot(alpha = 0.5)
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

(ii) heterogeneity across boroughs and neighborhoods

# Level order for `neighbourhood`: the distinct
# (neighbourhood_group, neighbourhood) pairs as returned by the grouped
# summarise(), so neighbourhoods of the same borough sit next to each other on
# the x-axis of the plots below.
order_nei <- dat %>%
  group_by(neighbourhood_group, neighbourhood) %>%
  summarise() %>%
  pull(neighbourhood) %>%
  as.character() %>%
  # factor() rejects duplicated levels; guard against a neighbourhood name that
  # occurs in more than one borough (its first position is kept).
  unique()
dat <- dat %>%
  mutate(neighbourhood = factor(neighbourhood, levels = order_nei))

# Log price per neighbourhood.
ggplot(dat, aes(x = neighbourhood, y = log(price))) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

# Log price per borough.
ggplot(dat, aes(x = neighbourhood_group, y = log(price))) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

# Listing counts per neighbourhood, coloured by borough.
ggplot(dat, aes(x = neighbourhood, fill = neighbourhood_group)) +
  geom_bar()

# Listing counts per borough.
ggplot(dat, aes(x = neighbourhood_group, fill = neighbourhood_group)) +
  geom_bar()

# Five neighbourhoods with the most listings.
dat %>%
  group_by(neighbourhood_group, neighbourhood) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(5)
## # A tibble: 5 x 3
## # Groups:   neighbourhood_group [2]
##   neighbourhood_group neighbourhood      count
##   <fct>               <fct>              <int>
## 1 Brooklyn            Williamsburg        3920
## 2 Brooklyn            Bedford-Stuyvesant  3714
## 3 Manhattan           Harlem              2658
## 4 Brooklyn            Bushwick            2465
## 5 Manhattan           Upper West Side     1971

room type

# Log price by room type.
ggplot(dat, aes(x = room_type, y = log(price))) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

# Mosaic plot of room type against borough, tiles coloured by room type.
ggplot(dat) +
  geom_mosaic(
    aes(x = product(room_type, neighbourhood_group), fill = room_type)
  )

maps

library("ggmap")

# Black-and-white "toner" base map for the bounding box given by
# left/right/bottom/top (longitude/latitude).
ny.map <- get_map(
  location = c(
    left = -74.2445,
    right = -73.71298,
    bottom = 40.49975,
    top = 40.9131
  ),
  color = "bw",
  maptype = "toner",
  source = "stamen"
)

# 2-D density of listing coordinates drawn over the base map; both fill colour
# and transparency follow the density level.
ggmap(ny.map) +
  stat_density2d(
    data = dat,
    aes(x = longitude, y = latitude, fill = ..level.., alpha = ..level..),
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  scale_alpha(range = c(0, 0.75), guide = FALSE)

# Background image and colour ramp for the two scatter maps below.
img <- readJPEG("New_York_City_.jpg")
jet.colors <- colorRampPalette(
  c(
    "#00007F", "blue", "#007FFF", "cyan", "#7FFF7F",
    "yellow", "#FF7F00", "red", "#7F0000"
  )
)

# The image stretched over a fixed longitude/latitude rectangle; both scatter
# maps use the same backdrop layer.
nyc_backdrop <- annotation_custom(
  rasterGrob(img, width = unit(1, "npc"), height = unit(1, "npc")),
  xmin = -74.258,
  xmax = -73.69,
  ymin = 40.49,
  ymax = 40.92
)

# Listings coloured by log(1 + price); the colour scale is limited to [3, 7].
ggplot(dat, aes(x = longitude, y = latitude, color = log(1 + price))) +
  nyc_backdrop +
  geom_point(cex = 0.4, alpha = 0.5) +
  scale_colour_gradientn(colors = jet.colors(7), limits = c(3, 7))

# Listings coloured by availability_365 (red = low, grey = high).
ggplot(dat, aes(x = longitude, y = latitude, color = availability_365)) +
  nyc_backdrop +
  geom_point(cex = 0.4, alpha = 0.5) +
  scale_colour_gradient(low = "red", high = "grey")

words

library(tidytext)
library("textdata")

# Split each listing name into lower-case words: a few punctuation characters
# become spaces, a handful of common function words are removed, and the rest
# is split on single spaces.
words <- dat$name %>%
  str_to_lower() %>%
  str_replace_all("\\+|&|@|\\/|!|;|,", " ") %>%
  # \\b anchors restrict the removal to whole words; without them the pattern
  # also deleted "in"/"on"/"to"/... from inside other words.
  str_replace_all("\\b(?:by|the|of|in|on|to)\\b", "") %>%
  str_split(" ")

# Drop the empty tokens left by consecutive spaces or removed words, and the
# single NA token produced by a missing name, so such listings count 0 words
# rather than 1.
words <- map(words, ~ .x[!is.na(.x) & .x != ""])
word_count <- map_dbl(words, length)
dat <- dat %>% mutate(wcount = word_count)

# Name length against log price, with a smoother. The plots read the `wcount`
# column of `dat` rather than the global `word_count` vector.
ggplot(dat, aes(x = wcount, y = log(price))) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).

# Name length against review count on a log(1 + x) scale.
ggplot(dat, aes(x = wcount, y = log(1 + number_of_reviews))) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

library(wordcloud)
## Loading required package: RColorBrewer

# First word of every listing name (word() defaults to the first word),
# lower-cased and stripped of punctuation. Stop words are handled by the filter
# below; the earlier `|a|the` alternatives in this pattern removed every letter
# "a" from every word.
# NOTE(review): `names` shadows base::names as a variable; the names() call
# further down still resolves to the function. Name kept in case later code
# reads it.
names <- dat$name %>%
  str_to_lower() %>%
  word() %>%
  str_replace_all(
    "\\+|@|\\/|!|;|,|\\*|\\(|\\)|:|-|_|¡|\\.|\\'|‘|’|\"|“|”",
    ""
  )

# `stop_words` is a data frame, so membership has to be tested against its
# `word` column; `%in% stop_words` never matched anything. Missing and empty
# tokens are dropped as well.
all_words <- names[
  !is.na(names) & names != "" & !names %in% stop_words$word
] %>%
  table()
wordcloud(names(all_words), all_words, max.words = 100)

model

references:

http://www2.stat.duke.edu/~cr173/Sta444_Fa18/

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3074178/pdf/nihms237255.pdf

https://www.google.com/search?q=spatial+prior+hierachical+model+r+package&oq=spatial+prior+hierachical+model+r+package&aqs=chrome..69i57.16517j0j7&sourceid=chrome&ie=UTF-8

R packages: HSAR, spBayes, CARBayes